
# Loading -----------------------------------------------------------------

# Set working directory
setwd("/Users/Elisabeth/Dropbox/Climate fairness/06 Empirics/05 DCP data/06 Replication NCC")

# Loading packages
pckgs <- c("haven", "stringr", "tm", "tidytext", "quanteda", "devtools",
           "ggplot2", "ggwordcloud", "ggrepel")
for(p in pckgs){
    if(!(p %in% rownames(installed.packages()))){
        install.packages(p)
    }
    library(p, character.only = T)
}
if(!("SpeedReader" %in% rownames(installed.packages()))){
    devtools::install_github("matthewjdenny/SpeedReader")
}
library(SpeedReader)

# Loading data
dt <- read_dta("02 Data/pooled_touse.dta",
               col_select = c("caseid", "Q4_4", "Q5", 
                              "ctry_us", "ctry_uk", "ctry_de", "ctry_fr"))
### Q4_4 is selection of a time path
### Q5 is text explanation

# Splitting data by country
dt.us <- subset(dt, ctry_us == 1)
dt.uk <- subset(dt, ctry_uk == 1)
dt.ge <- subset(dt, ctry_de == 1)
dt.fr <- subset(dt, ctry_fr == 1)
rm(dt)

# Cleaning text -----------------------------------------------------------

# create function that cleans text
cleantext <- function(dt, language){
    
    dt <- dt[,1:3]
    colnames(dt) <- c("caseid", "choice", "response")
    
    # eliminate accents
    dt$response <- iconv(dt$response, to="ASCII//TRANSLIT")
    
    # create document feature matrix
    featmat <- dfm(dt$response, 
                   tolower = TRUE, # remove capitalization
                   remove = stopwords(language),  # remove stop words
                   groups = dt$caseid)
    
    # apply word stemming
    featmat <- dfm_wordstem(featmat, language = language)
    
    # remove words of only 1 or 2 characters
    featmat <- dfm_select(featmat, min_nchar = 3)
    
    # remove words not used at least five times
    featmat <- dfm_trim(featmat, min_termfreq = 5)
    
    # transform to format needed
    dtm <- convert_quanteda_to_slam(featmat)
    
    # find the time path choices for texts present in the dtm
    usedtexts <- subset(dt, caseid %in% dtm$dimnames$Docs)
    choices <- data.frame(caseid = usedtexts$caseid,
                          choice = as.character(as_factor(usedtexts$choice)))
    
    return(list(dtm, choices))
    
}

# create all document term matrices
dtm.us <- cleantext(dt.us, "en")
dtm.uk <- cleantext(dt.uk, "en")
dtm.fr <- cleantext(dt.fr, "fr")
dtm.ge <- cleantext(dt.ge, "de")

# Finding distinguishing words --------------------------------------------

# calculating average number of words per answer for fightin words metaparameter
lapply(list(dtm.us[[1]], dtm.us[[1]], dtm.fr[[1]], dtm.ge[[1]]),
       function(x) mean(slam::rollup(x, 2, na.rm=TRUE, FUN = sum)))

# create function that performs analysis
distwords <- function(dtm){
    
    # create the contingengcy table
    contab <- contingency_table(dtm[[2]], dtm[[1]],
                                variables_to_use = "choice")
    
    # calculate pointwise mutual information
    pmires <- pmi(contab, display_top_x_terms = 20)
    
    # implement fightin words algorithm
    fwres <- feature_selection(contab, alpha = 5)
    
    return(list(pmires, fwres))
    
}

# perform analysis by country
words.us <- distwords(dtm.us)
words.uk <- distwords(dtm.uk)
words.fr <- distwords(dtm.fr)
words.ge <- distwords(dtm.ge)
allwords <- list(words.us, words.uk, words.fr, words.ge)

# Assemble plotting data --------------------------------------------------

# create function that extracts data needed for plots
make.pldat <- function(words, country){
    
    pmires <- words[[1]]
    fwres <- words[[2]]
    
    ## get relevant pmi results
    pmi.list <- list()
    for(i in 1:4){
        pmi.list[[i]] <- cbind(pmires$pmi_ranked_terms[[i]],
                               pmires$ranked_pmi[[i]])
        colnames(pmi.list[[i]]) <- c("word", "pmi")
    }
    names(pmi.list) <- pmires$pmi_table$dimnames[[1]]
    
    # get relevant fwres results
    fw.list <- list()
    for(i in names(fwres)[1:4]){
        fw.list[[i]] <- fwres[[i]][1:20,c("term", "z_scores")] ## subsetting to top 20 words here
        rownames(fw.list[[i]]) <- 1:nrow(fw.list[[i]])
        colnames(fw.list[[i]]) <- c("word", "lor")
    }
    names(fw.list) <- names(fwres)[1:4]
    
    # merging the two
    combo.list <- list()
    for(i in names(fw.list)){
        combo.list[[i]] <- merge(pmi.list[[i]], fw.list[[i]])
        combo.list[[i]] <- cbind(combo.list[[i]], i)
    }
    
    # unlist into a dataframe
    pldt <- do.call(rbind, combo.list)
    rownames(pldt) <- 1:nrow(pldt)
    
    # adding country name
    pldt <- cbind(pldt, country)
    colnames(pldt) <- c("word", "pmi", "lor", "option", "country")
    
    # adding total word frequency
    wordcounts <- cbind(fwres$Term_Ordering$terms,
                        fwres$Term_Ordering$total_count)
    colnames(wordcounts) <- c("word", "wcount")
    pldt <- merge(pldt, wordcounts)
    
    # setting correct column classes
    pldt$word <- as.character(pldt$word)
    pldt$pmi <- as.numeric(as.character(pldt$pmi))
    pldt$lor <- as.numeric(as.character(pldt$lor))
    pldt$option <- as.character(pldt$option)
    pldt$wcount <- as.numeric(as.character(pldt$wcount))
    
    return(pldt)
    
}

# extract and combine data needed for plot
pldat.us <- make.pldat(words.us, "United States")
pldat.uk <- make.pldat(words.uk, "United Kingdom")
pldat.fr <- make.pldat(words.fr, "France")
pldat.ge <- make.pldat(words.ge, "Germany")

pldat <- rbind(pldat.us, pldat.uk, pldat.fr, pldat.ge)

pldat$option <- factor(pldat$option,
                       levels = c("Costs remain the same over the entire period",
                                  "Costs are lower initially and higher later",
                                  "Costs are higher initially and  lower later",
                                  "Costs are lower initially, higher in the middle, and lower later"))

pldat$lwcount <- log(pldat$wcount)

# Create plot -------------------------------------------------------------

pl <- ggplot(pldat, aes(x = lwcount, y = lor))
pl <- pl + geom_point()
pl <- pl + geom_text_repel(aes(label = word),
                           show.legend = F)
pl <- pl + facet_grid(pldat$country ~ pldat$option, 
                      scales = "free", switch = "y")
pl <- pl + theme_bw()
pl <- pl + labs(title = "Top 20 Most Distinguishing Words by Time Path and Country",
                x = "Log(Word Count)",
                y = "Log Odds Ratio")
pl <- pl + theme(plot.title = element_text(size=20))

ggsave(pl, file = "03 Results/Bechteletal_Fig6.pdf",
       width = 18, height = 12)
ggsave(pl, file = "03 Results/Bechteletal_Fig6.tiff",
       width = 18, height = 12)
